In [1]:
# Imports
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import csv
import calendar
import webbrowser
from datetime import datetime

# Constants
DATA_FOLDER = 'Data/'

Task 1. Compiling Ebola Data

The DATA_FOLDER/ebola folder contains summarized reports of Ebola cases from three countries (Guinea, Liberia and Sierra Leone) during the recent outbreak of the disease in West Africa. For each country, there are daily reports that contain various information about the outbreak in several cities in each country.

Use pandas to import these data files into a single Dataframe. Using this DataFrame, calculate for each country, the daily average per month of new cases and deaths. Make sure you handle all the different expressions for new cases and deaths that are used in the reports.


In [2]:
'''
Functions needed to solve task 1
'''

#function to import excel file into a dataframe
def importdata(path,date):
    allpathFiles = glob.glob(DATA_FOLDER+path+'/*.csv')
    list_data = []
    for file in allpathFiles:
        excel = pd.read_csv(file,parse_dates=[date])
        list_data.append(excel)
    return pd.concat(list_data)

#function to add the month on a new column of a DataFrame
def add_month(df):
    copy_df = df.copy()
    months = [calendar.month_name[x.month] for x in copy_df.Date]
    copy_df['Month'] = months
    return copy_df

#founction which loc only the column within a country and a specified month
#return a dataframe
def chooseCountry_month(dataframe,country,descr,month):
    df = dataframe.loc[(dataframe['Country']==country) & (dataframe['Description']==descr)]
    #df = add_month(df)
    df_month = df.loc[(df['Month']==month)]
    return df_month

# Create a dataframe with the number of death, the new cases and the daily infos for a country and a specified month 
def getmonthresults(dataframe,country,month):
    if country =='Liberia':
        descr_kill ='Total death/s in confirmed cases'
        descr_cases ='Total confirmed cases'
    if country =='Guinea':
        descr_kill ='Total deaths of confirmed'
        descr_cases ='Total cases of confirmed'
    if country == 'Sierra Leone': 
        descr_kill ='death_confirmed'
        descr_cases ='cum_confirmed'
    
    df_kill = chooseCountry_month(dataframe,country,descr_kill,month)
    df_cases = chooseCountry_month(dataframe,country,descr_cases,month)
    
    #calculate the number of new cases and of new deaths for the all month
    res_kill = int(df_kill.iloc[len(df_kill)-1].Totals)-int(df_kill.iloc[0].Totals)
    res_cases = int(df_cases.iloc[len(df_cases)-1].Totals)-int(df_cases.iloc[0].Totals)
    #calculate the number of days counted which is last day of register - first day of register
    nb_day = df_kill.iloc[len(df_kill)-1].Date.day-df_kill.iloc[0].Date.day 
    

    # Sometimes the values in the dataframe are wrong due to the excelfiles which are not all the same!
    # We then get negative results. Therefor we replace them all by NaN ! 
    if(res_cases < 0)&(res_kill <0):
        monthreport = pd.DataFrame({'New cases':[np.nan],'Deaths':[np.nan],'daily average of New cases':[np.nan],'daily average of Deaths':[np.nan],'month':[month],'Country':[country]})
    elif(res_cases >= 0) &( res_kill <0):
        monthreport = pd.DataFrame({'New cases':[res_cases],'Deaths':[np.nan],'daily average of New cases':[res_cases/nb_day],'daily average of Deaths':[np.nan],'month':[month],'Country':[country]})
    elif(res_cases < 0) & (res_kill >= 0):
        monthreport = pd.DataFrame({'New cases':[np.nan],'Deaths':[res_kill],'daily average of New cases':[np.nan],'daily average of Deaths':[res_kill/nb_day],'month':[month],'Country':[country]})
    elif(nb_day == 0):
        monthreport = pd.DataFrame({'New cases':'notEnoughdatas','Deaths':'notEnoughdatas','daily average of New cases':'notEnoughdatas','daily average of Deaths':'notEnoughdatas','month':[month],'Country':[country]})
    else:    
        monthreport = pd.DataFrame({'New cases':[res_cases],'Deaths':[res_kill],'daily average of New cases':[res_cases/nb_day],'daily average of Deaths':[res_kill/nb_day],'month':[month],'Country':[country]})
    return monthreport

#check if the  month and the country is in the dataframe df
def checkData(df,month,country):
    check = df.loc[(df['Country']==country)& (df['Month']== month)]
    return check

#return a dataframe with all the infos(daily new cases, daily death) for each month and each country
def getResults(data):
    Countries = ['Guinea','Liberia','Sierra Leone']
    Months = ['January','February','March','April','May','June','July','August','September','October','November','December']
    results=[]
    compteur =0
    for country in Countries:
        for month in Months:
            if not(checkData(data,month,country).empty) : #check if the datas for the month and country exist 
                res = getmonthresults(data,country,month)
                results.append(res)  
    return pd.concat(results)

In [3]:
# import data from guinea
path_guinea = 'Ebola/guinea_data/'
data_guinea = importdata(path_guinea,'Date')

# set the new order / change the columns / keep only the relevant datas / add the name of the country
data_guinea = data_guinea[['Date', 'Description','Totals']]
data_guinea['Country'] = ['Guinea']*len(data_guinea)

#search for New cases and death!!   
#descr(newcases): "Total cases of confirmed"  // descr(deaths): "Total deaths of confirmed"
data_guinea = data_guinea.loc[(data_guinea.Description=='Total cases of confirmed')|(data_guinea.Description=='Total deaths of confirmed')]

   
#import data from liberia
path_liberia = 'Ebola/liberia_data/'
data_liberia = importdata(path_liberia,'Date')
# set the new order / change the columns / keep only the relevant datas / add the name of the country
data_liberia = data_liberia[['Date', 'Variable','National']]
data_liberia['Country'] = ['Liberia']*len(data_liberia)

#search for New cases and death!!    
#descr(newcases): "Total confirmed cases"  // descr(deaths): "Total death/s in confirmed cases"  
data_liberia = data_liberia.loc[(data_liberia.Variable=='Total confirmed cases')|(data_liberia.Variable=='Total death/s in confirmed cases')]

#change the name of the columns to be able merge the 3 data sets
data_liberia = data_liberia.rename(columns={'Date': 'Date', 'Variable': 'Description','National':'Totals'})

    
#import data from sierra leonne
path_sl = 'Ebola/sl_data/'
data_sl = importdata(path_sl,'date')
# set the new order / change the columns / keep only the relevant datas / add the name of the country
data_sl = data_sl[['date', 'variable','National']]
data_sl['Country'] = ['Sierra Leone']*len(data_sl)

#search for new cases and death    
#descr(newcases): "cum_confirmed"  // descr(deaths): "death_confirmed"
data_sl = data_sl.loc[(data_sl.variable=='cum_confirmed')|(data_sl.variable=='death_confirmed')]
#change the name of the columns to be able merge the 3 data sets
data_sl = data_sl.rename(columns={'date': 'Date', 'variable': 'Description','National':'Totals'})


#merge the 3 dataframe into ONE which we'll apply our analysis
dataFrame = [data_guinea,data_liberia,data_sl]
data = pd.concat(dataFrame)

# Replace the NaN by 0;
data = data.fillna(0)
#add a column with the month
data = add_month(data)

#get the results from the data set -> see the function
results = getResults(data)

#print the resuults
results


Out[3]:
Country Deaths New cases daily average of Deaths daily average of New cases month
0 Guinea 109 212 4.03704 7.85185 August
0 Guinea 190 370 6.78571 13.2143 September
0 Guinea notEnoughdatas notEnoughdatas notEnoughdatas notEnoughdatas October
0 Liberia 18 31 1.38462 2.38462 June
0 Liberia 36 44 1.44 1.76 July
0 Liberia 203 237 7.80769 9.11538 August
0 Liberia 538 519 18.5517 17.8966 September
0 Liberia NaN NaN NaN NaN October
0 Liberia 0 367 0 13.1071 November
0 Liberia 0 NaN 0 NaN December
0 Sierra Leone 123 360 6.47368 18.9474 August
0 Sierra Leone 154 1069 5.31034 36.8621 September
0 Sierra Leone 538 1724 17.9333 57.4667 October
0 Sierra Leone 419 1835 14.9643 65.5357 November
0 Sierra Leone NaN 660 NaN 55 December

Task 2. RNA Sequences

In the DATA_FOLDER/microbiome subdirectory, there are 9 spreadsheets of microbiome data that was acquired from high-throughput RNA sequencing procedures, along with a 10th file that describes the content of each.

Use pandas to import the first 9 spreadsheets into a single DataFrame. Then, add the metadata information from the 10th spreadsheet as columns in the combined DataFrame. Make sure that the final DataFrame has a unique index and all the NaN values have been replaced by the tag unknown.


In [4]:
Sheet10_Meta = pd.read_excel(DATA_FOLDER +'microbiome/metadata.xls') 
allFiles = glob.glob(DATA_FOLDER + 'microbiome' + "/MID*.xls")
allFiles


Out[4]:
['Data/microbiome/MID1.xls',
 'Data/microbiome/MID2.xls',
 'Data/microbiome/MID3.xls',
 'Data/microbiome/MID4.xls',
 'Data/microbiome/MID5.xls',
 'Data/microbiome/MID6.xls',
 'Data/microbiome/MID7.xls',
 'Data/microbiome/MID8.xls',
 'Data/microbiome/MID9.xls']
  1. Creating and filling the DataFrame In order to iterate only once over the data folder, we will attach the metadata to each excel spreadsheet right after creating a DataFrame with it. This will allow the code to be shorter and clearer, but also to iterate only once on every line and therefore be more efficient.

In [5]:
#Creating an empty DataFrame to store our data and initializing a counter.
Combined_data = pd.DataFrame()
K = 0
while (K < int(len(allFiles))):
    
    #Creating a DataFrame and filling it with the excel's data
    df = pd.read_excel(allFiles[K], header=None)
    
    #Getting the metadata of the corresponding spreadsheet
    df['BARCODE'] = Sheet10_Meta.at[int(K), 'BARCODE']
    df['GROUP'] = Sheet10_Meta.at[int(K), 'GROUP']
    df['SAMPLE'] = Sheet10_Meta.at[int(K),'SAMPLE']
    
    #Append the recently created DataFrame to our combined one
    Combined_data = Combined_data.append(df)
    
    K = K + 1
    
#Renaming the columns with meaningfull names
Combined_data.columns = ['Name', 'Value','BARCODE','GROUP','SAMPLE']
Combined_data.head()


Out[5]:
Name Value BARCODE GROUP SAMPLE
0 Archaea "Crenarchaeota" Thermoprotei Desulfuro... 7 MID1 EXTRACTION CONTROL NaN
1 Archaea "Crenarchaeota" Thermoprotei Desulfuro... 2 MID1 EXTRACTION CONTROL NaN
2 Archaea "Crenarchaeota" Thermoprotei Sulfoloba... 3 MID1 EXTRACTION CONTROL NaN
3 Archaea "Crenarchaeota" Thermoprotei Thermopro... 3 MID1 EXTRACTION CONTROL NaN
4 Archaea "Euryarchaeota" "Methanomicrobia" Meth... 7 MID1 EXTRACTION CONTROL NaN
    3. Cleaning and reindexing

At first we get rid of the NaN value, we must replace them by "unknown". In order to have a more meaningful and single index, we will reset it to be the name of the RNA sequence.


In [6]:
#Replacing the NaN values with unknwown
Combined_data = Combined_data.fillna('unknown')

#Reseting the index
Combined_data = Combined_data.set_index('Name')

#Showing the result
Combined_data


Out[6]:
Value BARCODE GROUP SAMPLE
Name
Archaea "Crenarchaeota" Thermoprotei Desulfurococcales Desulfurococcaceae Ignisphaera 7 MID1 EXTRACTION CONTROL unknown
Archaea "Crenarchaeota" Thermoprotei Desulfurococcales Pyrodictiaceae Pyrolobus 2 MID1 EXTRACTION CONTROL unknown
Archaea "Crenarchaeota" Thermoprotei Sulfolobales Sulfolobaceae Stygiolobus 3 MID1 EXTRACTION CONTROL unknown
Archaea "Crenarchaeota" Thermoprotei Thermoproteales Thermofilaceae Thermofilum 3 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" "Methanomicrobia" Methanocellales Methanocellaceae Methanocella 7 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" "Methanomicrobia" Methanosarcinales Methanosarcinaceae Methanimicrococcus 1 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" "Methanomicrobia" Methanosarcinales Methermicoccaceae Methermicoccus 1 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" Archaeoglobi Archaeoglobales Archaeoglobaceae Ferroglobus 1 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" Archaeoglobi Archaeoglobales Archaeoglobaceae Geoglobus 1 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" Halobacteria Halobacteriales Halobacteriaceae Haloplanus 1 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" Halobacteria Halobacteriales Halobacteriaceae Haloquadratum 4 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" Halobacteria Halobacteriales Halobacteriaceae Halosimplex 1 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" Halobacteria Halobacteriales Halobacteriaceae Natronobacterium 2 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" Halobacteria Halobacteriales Halobacteriaceae Natronomonas 4 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" Methanococci Methanococcales Methanocaldococcaceae Methanotorris 1 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" Methanopyri Methanopyrales Methanopyraceae Methanopyrus 12 MID1 EXTRACTION CONTROL unknown
Archaea "Euryarchaeota" Thermoplasmata Thermoplasmatales Thermoplasmatales_incertae_sedis Thermogymnomonas 2 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Acidothermaceae Acidothermus 2 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Beutenbergiaceae Salana 1 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Bogoriellaceae Bogoriella 1 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Cellulomonadaceae Tropheryma 1 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Corynebacteriaceae Turicella 2 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Cryptosporangiaceae Cryptosporangium 53 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Dermabacteraceae Dermabacter 1 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Dermabacteraceae Devriesea 2 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Dermatophilaceae Kineosphaera 3 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Frankineae_incertae_sedis Fodinicola 1 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Geodermatophilaceae Blastococcus 99 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Geodermatophilaceae Geodermatophilus 17 MID1 EXTRACTION CONTROL unknown
Bacteria "Actinobacteria" Actinobacteria Actinomycetales Intrasporangiaceae Kribbia 2 MID1 EXTRACTION CONTROL unknown
... ... ... ... ...
Bacteria "Proteobacteria" Gammaproteobacteria Oceanospirillales Oceanospirillaceae Neptuniibacter 10 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Oceanospirillales Oceanospirillales_incertae_sedis Spongiispira 1 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Oceanospirillales Oleiphilaceae Oleiphilus 1 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Pasteurellales Pasteurellaceae Haemophilus 1 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Pseudomonadales Moraxellaceae Alkanindiges 12 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Pseudomonadales Moraxellaceae Perlucidibaca 4 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Pseudomonadales Pseudomonadaceae Azomonas 805 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Pseudomonadales Pseudomonadaceae Azorhizophilus 862 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Pseudomonadales Pseudomonadaceae Azotobacter 12 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas 3534 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Thiotrichales Piscirickettsiaceae Cycloclasticus 4 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Thiotrichales Piscirickettsiaceae Sulfurivirga 1 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Thiotrichales Thiotrichaceae Beggiatoa 25 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Thiotrichales Thiotrichales_incertae_sedis Fangia 2 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Xanthomonadales Sinobacteraceae Alkanibacter 3 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Xanthomonadales Sinobacteraceae Sinobacter 2 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Xanthomonadales Xanthomonadaceae Aquimonas 1 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Xanthomonadales Xanthomonadaceae Frateuria 3 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Xanthomonadales Xanthomonadaceae Ignatzschineria 2 MID9 Control 2 stool
Bacteria "Proteobacteria" Gammaproteobacteria Xanthomonadales Xanthomonadaceae Luteibacter 1 MID9 Control 2 stool
Bacteria "Synergistetes" Synergistia Synergistales Synergistaceae Aminomonas 3 MID9 Control 2 stool
Bacteria "Tenericutes" Mollicutes Anaeroplasmatales Anaeroplasmataceae Asteroleplasma 1 MID9 Control 2 stool
Bacteria "Tenericutes" Mollicutes Haloplasmatales Haloplasmataceae Haloplasma 11 MID9 Control 2 stool
Bacteria "Thermodesulfobacteria" Thermodesulfobacteria Thermodesulfobacteriales Thermodesulfobacteriaceae Caldimicrobium 1 MID9 Control 2 stool
Bacteria "Thermodesulfobacteria" Thermodesulfobacteria Thermodesulfobacteriales Thermodesulfobacteriaceae Thermodesulfatator 1 MID9 Control 2 stool
Bacteria "Thermotogae" Thermotogae Thermotogales Thermotogaceae Geotoga 43 MID9 Control 2 stool
Bacteria "Thermotogae" Thermotogae Thermotogales Thermotogaceae Kosmotoga 16 MID9 Control 2 stool
Bacteria "Verrucomicrobia" Opitutae Opitutales Opitutaceae Opitutus 1 MID9 Control 2 stool
Bacteria Cyanobacteria Cyanobacteria Chloroplast Chlorarachniophyceae 1 MID9 Control 2 stool
Bacteria Cyanobacteria Cyanobacteria Chloroplast Streptophyta 10 MID9 Control 2 stool

2396 rows × 4 columns

Task 3. Class War in Titanic

Use pandas to import the data file Data/titanic.xls. It contains data on all the passengers that travelled on the Titanic.

For each of the following questions state clearly your assumptions and discuss your findings:

  1. Describe the type and the value range of each attribute. Indicate and transform the attributes that can be Categorical.
  2. Plot histograms for the travel class, embarkation port, sex and age attributes. For the latter one, use discrete decade intervals.
  3. Calculate the proportion of passengers by cabin floor. Present your results in a pie chart.
  4. For each travel class, calculate the proportion of the passengers that survived. Present your results in pie charts.
  5. Calculate the proportion of the passengers that survived by travel class and sex. Present your results in a single histogram.
  6. Create 2 equally populated age categories and calculate survival proportions by age category, travel class and sex. Present your results in a DataFrame with unique index.

Question 3.1

Describe the type and the value range of each attribute. Indicate and transform the attributes that can be Categorical.

Assumptions:

- "For each exercise, please provide both a written explanation of the steps you will apply to manipulate the data, and the corresponding code." We assume that "written explanation can come in the form of commented code as well as text"
- We assume that we must not describe the value range of attributes that contain string as we dont feel the length of strings or ASCI-values don't give any insight

In [7]:
''' 
Here is a sample of the information in the titanic dataframe
''' 

# Importing titanic.xls info with Pandas
titanic = pd.read_excel('Data/titanic.xls')

# printing only the 30 first and last rows of information
print(titanic.head)


<bound method NDFrame.head of       pclass  survived                                               name  \
0          1         1                      Allen, Miss. Elisabeth Walton   
1          1         1                     Allison, Master. Hudson Trevor   
2          1         0                       Allison, Miss. Helen Loraine   
3          1         0               Allison, Mr. Hudson Joshua Creighton   
4          1         0    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)   
5          1         1                                Anderson, Mr. Harry   
6          1         1                  Andrews, Miss. Kornelia Theodosia   
7          1         0                             Andrews, Mr. Thomas Jr   
8          1         1      Appleton, Mrs. Edward Dale (Charlotte Lamson)   
9          1         0                            Artagaveytia, Mr. Ramon   
10         1         0                             Astor, Col. John Jacob   
11         1         1  Astor, Mrs. John Jacob (Madeleine Talmadge Force)   
12         1         1                      Aubart, Mme. Leontine Pauline   
13         1         1                       Barber, Miss. Ellen "Nellie"   
14         1         1               Barkworth, Mr. Algernon Henry Wilson   
15         1         0                                Baumann, Mr. John D   
16         1         0                           Baxter, Mr. Quigg Edmond   
17         1         1    Baxter, Mrs. James (Helene DeLaudeniere Chaput)   
18         1         1                              Bazzani, Miss. Albina   
19         1         0                               Beattie, Mr. Thomson   
20         1         1                      Beckwith, Mr. Richard Leonard   
21         1         1   Beckwith, Mrs. Richard Leonard (Sallie Monypeny)   
22         1         1                              Behr, Mr. Karl Howell   
23         1         1                              Bidois, Miss. Rosalie   
24         1         1                                  Bird, Miss. Ellen   
25         1         0                                Birnbaum, Mr. Jakob   
26         1         1                            Bishop, Mr. Dickinson H   
27         1         1            Bishop, Mrs. Dickinson H (Helen Walton)   
28         1         1                             Bissette, Miss. Amelia   
29         1         1          Bjornstrom-Steffansson, Mr. Mauritz Hakan   
...      ...       ...                                                ...   
1279       3         0               Vestrom, Miss. Hulda Amanda Adolfina   
1280       3         0                                    Vovk, Mr. Janko   
1281       3         0                               Waelens, Mr. Achille   
1282       3         0                                Ware, Mr. Frederick   
1283       3         0                        Warren, Mr. Charles William   
1284       3         0                                  Webber, Mr. James   
1285       3         0                                Wenzel, Mr. Linhart   
1286       3         1    Whabee, Mrs. George Joseph (Shawneene Abi-Saab)   
1287       3         0                   Widegren, Mr. Carl/Charles Peter   
1288       3         0                          Wiklund, Mr. Jakob Alfred   
1289       3         0                            Wiklund, Mr. Karl Johan   
1290       3         1                   Wilkes, Mrs. James (Ellen Needs)   
1291       3         0                   Willer, Mr. Aaron ("Abi Weller")   
1292       3         0                                 Willey, Mr. Edward   
1293       3         0                  Williams, Mr. Howard Hugh "Harry"   
1294       3         0                               Williams, Mr. Leslie   
1295       3         0                                Windelov, Mr. Einar   
1296       3         0                                   Wirz, Mr. Albert   
1297       3         0                             Wiseman, Mr. Phillippe   
1298       3         0                          Wittevrongel, Mr. Camille   
1299       3         0                                Yasbeck, Mr. Antoni   
1300       3         1            Yasbeck, Mrs. Antoni (Selini Alexander)   
1301       3         0                               Youseff, Mr. Gerious   
1302       3         0                                  Yousif, Mr. Wazli   
1303       3         0                              Yousseff, Mr. Gerious   
1304       3         0                               Zabour, Miss. Hileni   
1305       3         0                              Zabour, Miss. Thamine   
1306       3         0                          Zakarian, Mr. Mapriededer   
1307       3         0                                Zakarian, Mr. Ortin   
1308       3         0                                 Zimmerman, Mr. Leo   

         sex      age  sibsp  parch            ticket      fare    cabin  \
0     female  29.0000      0      0             24160  211.3375       B5   
1       male   0.9167      1      2            113781  151.5500  C22 C26   
2     female   2.0000      1      2            113781  151.5500  C22 C26   
3       male  30.0000      1      2            113781  151.5500  C22 C26   
4     female  25.0000      1      2            113781  151.5500  C22 C26   
5       male  48.0000      0      0             19952   26.5500      E12   
6     female  63.0000      1      0             13502   77.9583       D7   
7       male  39.0000      0      0            112050    0.0000      A36   
8     female  53.0000      2      0             11769   51.4792     C101   
9       male  71.0000      0      0          PC 17609   49.5042      NaN   
10      male  47.0000      1      0          PC 17757  227.5250  C62 C64   
11    female  18.0000      1      0          PC 17757  227.5250  C62 C64   
12    female  24.0000      0      0          PC 17477   69.3000      B35   
13    female  26.0000      0      0             19877   78.8500      NaN   
14      male  80.0000      0      0             27042   30.0000      A23   
15      male      NaN      0      0          PC 17318   25.9250      NaN   
16      male  24.0000      0      1          PC 17558  247.5208  B58 B60   
17    female  50.0000      0      1          PC 17558  247.5208  B58 B60   
18    female  32.0000      0      0             11813   76.2917      D15   
19      male  36.0000      0      0             13050   75.2417       C6   
20      male  37.0000      1      1             11751   52.5542      D35   
21    female  47.0000      1      1             11751   52.5542      D35   
22      male  26.0000      0      0            111369   30.0000     C148   
23    female  42.0000      0      0          PC 17757  227.5250      NaN   
24    female  29.0000      0      0          PC 17483  221.7792      C97   
25      male  25.0000      0      0             13905   26.0000      NaN   
26      male  25.0000      1      0             11967   91.0792      B49   
27    female  19.0000      1      0             11967   91.0792      B49   
28    female  35.0000      0      0          PC 17760  135.6333      C99   
29      male  28.0000      0      0            110564   26.5500      C52   
...      ...      ...    ...    ...               ...       ...      ...   
1279  female  14.0000      0      0            350406    7.8542      NaN   
1280    male  22.0000      0      0            349252    7.8958      NaN   
1281    male  22.0000      0      0            345767    9.0000      NaN   
1282    male      NaN      0      0            359309    8.0500      NaN   
1283    male      NaN      0      0        C.A. 49867    7.5500      NaN   
1284    male      NaN      0      0  SOTON/OQ 3101316    8.0500      NaN   
1285    male  32.5000      0      0            345775    9.5000      NaN   
1286  female  38.0000      0      0              2688    7.2292      NaN   
1287    male  51.0000      0      0            347064    7.7500      NaN   
1288    male  18.0000      1      0           3101267    6.4958      NaN   
1289    male  21.0000      1      0           3101266    6.4958      NaN   
1290  female  47.0000      1      0            363272    7.0000      NaN   
1291    male      NaN      0      0              3410    8.7125      NaN   
1292    male      NaN      0      0     S.O./P.P. 751    7.5500      NaN   
1293    male      NaN      0      0          A/5 2466    8.0500      NaN   
1294    male  28.5000      0      0             54636   16.1000      NaN   
1295    male  21.0000      0      0  SOTON/OQ 3101317    7.2500      NaN   
1296    male  27.0000      0      0            315154    8.6625      NaN   
1297    male      NaN      0      0        A/4. 34244    7.2500      NaN   
1298    male  36.0000      0      0            345771    9.5000      NaN   
1299    male  27.0000      1      0              2659   14.4542      NaN   
1300  female  15.0000      1      0              2659   14.4542      NaN   
1301    male  45.5000      0      0              2628    7.2250      NaN   
1302    male      NaN      0      0              2647    7.2250      NaN   
1303    male      NaN      0      0              2627   14.4583      NaN   
1304  female  14.5000      1      0              2665   14.4542      NaN   
1305  female      NaN      1      0              2665   14.4542      NaN   
1306    male  26.5000      0      0              2656    7.2250      NaN   
1307    male  27.0000      0      0              2670    7.2250      NaN   
1308    male  29.0000      0      0            315082    7.8750      NaN   

     embarked boat   body                           home.dest  
0           S    2    NaN                        St Louis, MO  
1           S   11    NaN     Montreal, PQ / Chesterville, ON  
2           S  NaN    NaN     Montreal, PQ / Chesterville, ON  
3           S  NaN  135.0     Montreal, PQ / Chesterville, ON  
4           S  NaN    NaN     Montreal, PQ / Chesterville, ON  
5           S    3    NaN                        New York, NY  
6           S   10    NaN                          Hudson, NY  
7           S  NaN    NaN                         Belfast, NI  
8           S    D    NaN                 Bayside, Queens, NY  
9           C  NaN   22.0                 Montevideo, Uruguay  
10          C  NaN  124.0                        New York, NY  
11          C    4    NaN                        New York, NY  
12          C    9    NaN                       Paris, France  
13          S    6    NaN                                 NaN  
14          S    B    NaN                       Hessle, Yorks  
15          S  NaN    NaN                        New York, NY  
16          C  NaN    NaN                        Montreal, PQ  
17          C    6    NaN                        Montreal, PQ  
18          C    8    NaN                                 NaN  
19          C    A    NaN                        Winnipeg, MN  
20          S    5    NaN                        New York, NY  
21          S    5    NaN                        New York, NY  
22          C    5    NaN                        New York, NY  
23          C    4    NaN                                 NaN  
24          S    8    NaN                                 NaN  
25          C  NaN  148.0                   San Francisco, CA  
26          C    7    NaN                        Dowagiac, MI  
27          C    7    NaN                        Dowagiac, MI  
28          S    8    NaN                                 NaN  
29          S    D    NaN  Stockholm, Sweden / Washington, DC  
...       ...  ...    ...                                 ...  
1279        S  NaN    NaN                                 NaN  
1280        S  NaN    NaN                                 NaN  
1281        S  NaN    NaN      Antwerp, Belgium / Stanton, OH  
1282        S  NaN    NaN                                 NaN  
1283        S  NaN    NaN                                 NaN  
1284        S  NaN    NaN                                 NaN  
1285        S  NaN  298.0                                 NaN  
1286        C    C    NaN                                 NaN  
1287        S  NaN    NaN                                 NaN  
1288        S  NaN  314.0                                 NaN  
1289        S  NaN    NaN                                 NaN  
1290        S  NaN    NaN                                 NaN  
1291        S  NaN    NaN                                 NaN  
1292        S  NaN    NaN                                 NaN  
1293        S  NaN    NaN                                 NaN  
1294        S  NaN   14.0                                 NaN  
1295        S  NaN    NaN                                 NaN  
1296        S  NaN  131.0                                 NaN  
1297        S  NaN    NaN                                 NaN  
1298        S  NaN    NaN                                 NaN  
1299        C    C    NaN                                 NaN  
1300        C  NaN    NaN                                 NaN  
1301        C  NaN  312.0                                 NaN  
1302        C  NaN    NaN                                 NaN  
1303        C  NaN    NaN                                 NaN  
1304        C  NaN  328.0                                 NaN  
1305        C  NaN    NaN                                 NaN  
1306        C  NaN  304.0                                 NaN  
1307        C  NaN    NaN                                 NaN  
1308        S  NaN    NaN                                 NaN  

[1309 rows x 14 columns]>

In [8]:
'''
To describe the INTENDED values and types of the data we will show you the titanic.html file that was provided to us
Notice:
    - 'age' is of type double, so someone can be 17.5 years old, mostly used with babies that are 0.x years old
    - 'cabin' is stored as integer, but it har characters and letters
    - By this model, embarked is stored as an integer, witch has to be interpreted as the 3 different embarkation ports
    - It says that 'boat' is stored as a integer even though it has spaces and letters, it should be stored as string
    
PS: it might be that the information stored as integer is supposed to be categorical data,
        ...because they have a "small" amount of valid options
''' 

# Display html info in Jupyter Notebook
from IPython.core.display import display, HTML
htmlFile = 'Data/titanic.html'
display(HTML(htmlFile))



Data frame:titanic3

1309 observations and 14 variables, maximum # NAs:1188
NameLabelsUnitsLevelsStorageNAs
pclass
3
integer
0
survivedSurvived
double
0
nameName
character
0
sex
2
integer
0
ageAgeYear
double
263
sibspNumber of Siblings/Spouses Aboard
double
0
parchNumber of Parents/Children Aboard
double
0
ticketTicket Number
character
0
farePassenger FareBritish Pound (\243)
double
1
cabin
187
integer
0
embarked
3
integer
2
boat
28
integer
0
bodyBody Identification Number
double
1188
home.destHome/Destination
character
0

VariableLevels
pclass1st
2nd
3rd
sexfemale
male
cabin
A10
A11
A14
A16
A18
A19
A20
A21
A23
A24
A26
A29
A31
A32
A34
A36
A5
A6
A7
A9
B10
B101
B102
B11
B18
B19
B20
B22
B24
B26
B28
B3
B30
B35
B36
B37
B38
B39
B4
B41
B42
B45
B49
B5
B50
B51 B53 B55
B52 B54 B56
B57 B59 B63 B66
B58 B60
B61
B69
B71
B73
B77
B78
B79
B80
B82 B84
B86
B94
B96 B98
C101
C103
C104
C105
C106
C110
C111
C116
C118
C123
C124
C125
C126
C128
C130
C132
C148
C2
C22 C26
C23 C25 C27
C28
C30
C31
C32
C39
C45
C46
C47
C49
C50
C51
C52
C53
C54
C55 C57
C6
C62 C64
C65
C68
C7
C70
C78
C80
C82
C83
C85
C86
C87
C89
C90
C91
C92
C93
C95
C97
C99
D
D10 D12
D11
D15
D17
D19
D20
D21
D22
D26
D28
D30
D33
D34
D35
D36
D37
D38
D40
D43
D45
D46
D47
D48
D49
D50
D56
D6
D7
D9
E10
E101
E12
E121
E17
E24
E25
E31
E33
E34
E36
E38
E39 E41
E40
E44
E45
E46
E49
E50
E52
E58
E60
E63
E67
E68
E77
E8
F
F E46
F E57
F E69
F G63
F G73
F2
F33
F38
F4
G6
T
embarkedCherbourg
Queenstown
Southampton
boat
1
10
11
12
13
13 15
13 15 B
14
15
15 16
16
2
3
4
5
5 7
5 9
6
7
8
8 10
9
A
B
C
C D
D


In [9]:
''' 
The default types of the data after import:
Notice:
    - the strings and characters are imported as objects
    - 'survived' is imported as int instead of double (which is in our opinion better since it's only 0 and 1
    - 'sex' is imported as object not integer because it is a string
'''

titanic.dtypes


Out[9]:
pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [10]:
''' 
Below you can see the value range of the different numerical values.

name, sex, ticket, cabin, embarked, boat and home.dest is not included because they can't be quantified numerically.
''' 

titanic.describe()


Out[10]:
pclass survived age sibsp parch fare body
count 1309.000000 1309.000000 1046.000000 1309.000000 1309.000000 1308.000000 121.000000
mean 2.294882 0.381971 29.881135 0.498854 0.385027 33.295479 160.809917
std 0.837836 0.486055 14.413500 1.041658 0.865560 51.758668 97.696922
min 1.000000 0.000000 0.166700 0.000000 0.000000 0.000000 1.000000
25% 2.000000 0.000000 21.000000 0.000000 0.000000 7.895800 72.000000
50% 3.000000 0.000000 28.000000 0.000000 0.000000 14.454200 155.000000
75% 3.000000 1.000000 39.000000 1.000000 0.000000 31.275000 256.000000
max 3.000000 1.000000 80.000000 8.000000 9.000000 512.329200 328.000000

In [11]:
'''
Additional information that is important to remember when manipulation the data
is if/where there are NaN values in the dataset
'''

# This displays the number of NaN there is in different attributes
print(pd.isnull(titanic).sum())

'''
Some of this data is missing while some is meant to describe 'No' or something of meaning.
Example:
    Cabin has 1014 NaN in its column, it might be that every passenger had a cabin and the data is missing.
    Or it could mean that most passengers did not have a cabin or a mix. The displayed titanic.html file 
    give us some insight if it is correct. It says that there are 0 NaN in the column. This indicates that
    there are 1014 people without a cabin. Boat has also 823 NaN's, while the titanic lists 0 NaN's. 
    It is probably because most of those who died probably weren't in a boat.
'''


pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64
Out[11]:
"\nSome of this data is missing while some is meant to describe 'No' or something of meaning.\nExample:\n    Cabin has 1014 NaN in its column, it might be that every passenger had a cabin and the data is missing.\n    Or it could mean that most passengers did not have a cabin or a mix. The displayed titanic.html file \n    give us some insight if it is correct. It says that there are 0 NaN in the column. This indicates that\n    there are 1014 people without a cabin. Boat has also 823 NaN's, while the titanic lists 0 NaN's. \n    It is probably because most of those who died probably weren't in a boat.\n"

In [12]:
'''
What attributes should be stored as categorical information?

Categorical data is essentially 8-bit integers which means it can store up to 2^8 = 256 categories
Benefit is that it makes memory usage lower and it has a performance increase in calculations.
'''

print('Number of unique values in... :')
for attr in titanic:
    print("   {attr}: {u}".format(attr=attr, u=len(titanic[attr].unique())))


Number of unique values in... :
   pclass: 3
   survived: 2
   name: 1307
   sex: 2
   age: 99
   sibsp: 7
   parch: 8
   ticket: 939
   fare: 282
   cabin: 187
   embarked: 4
   boat: 29
   body: 122
   home.dest: 370

In [13]:
'''
We think it will be smart to categorize: 'pclass', 'survived', 'sex', 'cabin', 'embarked' and 'boat'
because they have under 256 categories and don't have a strong numerical value like 'age'
'survived' is a bordercase because it might be more practical to work with integers in some settings
'''

# changing the attributes to categorical data
titanic.pclass = titanic.pclass.astype('category')
titanic.survived = titanic.survived.astype('category')
titanic.sex = titanic.sex.astype('category')
titanic.cabin = titanic.cabin.astype('category')
titanic.embarked = titanic.embarked.astype('category')
titanic.boat = titanic.boat.astype('category')

#Illustrate the change by printing out the new types
titanic.dtypes


Out[13]:
pclass       category
survived     category
name           object
sex          category
age           float64
sibsp           int64
parch           int64
ticket         object
fare          float64
cabin        category
embarked     category
boat         category
body          float64
home.dest      object
dtype: object

Question 3.2

"Plot histograms for the travel class, embarkation port, sex and age attributes. For the latter one, use discrete decade intervals. "

assumptions:


In [14]:
#Plotting the ratio different classes(1st, 2nd and 3rd class) the passengers have
pc = titanic.pclass.value_counts().sort_index().plot(kind='bar')
pc.set_title('Travel classes')
pc.set_ylabel('Number of passengers')
pc.set_xlabel('Travel class')
pc.set_xticklabels(('1st class', '2nd class', '3rd class'))
plt.show(pc)

#Plotting the amount of people that embarked from different cities(C=Cherbourg, Q=Queenstown, S=Southampton)
em = titanic.embarked.value_counts().sort_index().plot(kind='bar')
em.set_title('Ports of embarkation')
em.set_ylabel('Number of passengers')
em.set_xlabel('Port of embarkation')
em.set_xticklabels(('Cherbourg', 'Queenstown', 'Southampton'))
plt.show(em)

#Plotting what sex the passengers are
sex = titanic.sex.value_counts().plot(kind='bar')
sex.set_title('Gender of the passengers')
sex.set_ylabel('Number of Passengers')
sex.set_xlabel('Gender')
sex.set_xticklabels(('Female', 'Male'))
plt.show(sex)

#Plotting agegroup of passengers
bins = [0,10,20,30,40,50,60,70,80]
age_grouped = pd.DataFrame(pd.cut(titanic.age, bins))
ag = age_grouped.age.value_counts().sort_index().plot.bar()
ag.set_title('Age of Passengers ')
ag.set_ylabel('Number of passengers')
ag.set_xlabel('Age groups')
plt.show(ag)


Question 3.3

Calculate the proportion of passengers by cabin floor. Present your results in a pie chart.

assumptions:

  • Because we are tasked with categorizing persons by the floor of their cabin it was problematic that you had cabin input: "F E57" and "F G63". There were only 7 of these instances with conflicting cabinfloors. We also presumed that the was a floor "T". Even though there was only one instance, so it might have been a typo.
  • We assume that you don't want to include people without cabinfloor

In [15]:
'''
Parsing the cabinfloor, into floors A, B, C, D, E, F, G, T and display in a pie chart

'''
#Dropping NaN (People without cabin)
cabin_floors = titanic.cabin.dropna()

# removes digits and spaces
cabin_floors = cabin_floors.str.replace(r'[\d ]+', '')
# removes duplicate letters and leave unique (CC -> C) (FG -> G)
cabin_floors = cabin_floors.str.replace(r'(.)(?=.*\1)', '')
# removes ambigous data from the dataset (FE -> NaN)(FG -> NaN)
cabin_floors = cabin_floors.str.replace(r'([A-Z]{1})\w+', 'NaN' )

# Recategorizing (Since we altered the entries, we messed with the categories)
cabin_floors = cabin_floors.astype('category')
# Removing NaN (uin this case ambigous data)
cabin_floors = cabin_floors.cat.remove_categories('NaN')
cabin_floors = cabin_floors.dropna()

# Preparing data for plt.pie
numberOfCabinPlaces = cabin_floors.count()
grouped = cabin_floors.groupby(cabin_floors).count()
sizes = np.array(grouped)
labels = np.array(grouped.index)

# Plotting the pie chart
plt.pie(sizes, labels=labels, autopct='%1.1f%%', pctdistance=0.75, labeldistance=1.1)
print("There are {cabin} passengers that have cabins and {nocabin} passengers without a cabin"
      .format(cabin=numberOfCabinPlaces, nocabin=(len(titanic) - numberOfCabinPlaces)))


There are 288 passengers that have cabins and 1021 passengers without a cabin

Question 3.4

For each travel class, calculate the proportion of the passengers that survived. Present your results in pie charts.

assumptions:


In [16]:
# function that returns the number of people that survived and died given a specific travelclass
def survivedPerClass(pclass):
    survived = len(titanic.survived[titanic.survived == 1][titanic.pclass == pclass])
    died = len(titanic.survived[titanic.survived == 0][titanic.pclass == pclass])
    return [survived, died]

# Fixing the layout horizontal
the_grid = plt.GridSpec(1, 3)
labels = ["Survived", "Died"]

# Each iteration plots a pie chart
for p in titanic.pclass.unique():
    sizes = survivedPerClass(p)
    plt.subplot(the_grid[0, p-1], aspect=1 )
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
    
plt.show()


Question 3.5

"Calculate the proportion of the passengers that survived by travel class and sex. Present your results in a single histogram."

assumptions:

1. By "proportions" We assume it is a likelyhood-percentage of surviving

In [23]:
# group by selected data and get a count for each category
survivalrate = titanic.groupby(['pclass', 'sex', 'survived']).size()

# calculate percentage
survivalpercentage = survivalrate.groupby(level=['pclass', 'sex']).apply(lambda x: x / x.sum() * 100)

# plotting in a histogram
histogram = survivalpercentage.filter(like='1', axis=0).plot(kind='bar')
histogram.set_title('Proportion of the passengers that survived by travel class and sex')
histogram.set_ylabel('Percent likelyhood of surviving titanic')
histogram.set_xlabel('class/gender group')
plt.show(histogram)


Question 3.6

"Create 2 equally populated age categories and calculate survival proportions by age category, travel class and sex. Present your results in a DataFrame with unique index."

assumptions:

  1. By "proportions" we assume it is a likelyhood-percentage of surviving
  2. To create 2 equally populated age categories; we will find the median and round up from the median to nearest whole year difference before splitting.

In [24]:
#drop NaN rows
age_without_nan = titanic.age.dropna()

#categorizing
age_categories = pd.qcut(age_without_nan, 2, labels=["Younger", "Older"])

#Numbers to explain difference
median = int(np.float64(age_without_nan.median()))
amount = int(age_without_nan[median])
print("The Median age is {median} years old".format(median = median))
print("and there are {amount} passengers that are {median} year old \n".format(amount=amount, median=median))

print(age_categories.groupby(age_categories).count())
print("\nAs you can see the pd.qcut does not cut into entirely equal sized bins, because the age is of a discreet nature")


The Median age is 28 years old
and there are 35 passengers that are 28 year old 

age
Older      510
Younger    536
Name: age, dtype: int64

As you can see the pd.qcut does not cut into entirely equal sized bins, because the age is of a discreet nature

In [25]:
# imported for the sake of surpressing some warnings
import warnings
warnings.filterwarnings('ignore')

# extract relevant attributes
csas = titanic[['pclass', 'sex', 'age', 'survived']]
csas.dropna(subset=['age'], inplace=True)

# Defining the categories
csas['age_group'] = csas.age > csas.age.median()
csas['age_group'] = csas['age_group'].map(lambda age_category: 'older' if age_category else "younger")

# Converting to int to make it able to aggregate and give percentage
csas.survived = csas.survived.astype(int)

g_categories = csas.groupby(['pclass', 'age_group', 'sex'])
result = pd.DataFrame(g_categories.survived.mean()).rename(columns={'survived': 'survived proportion'})

# reset current index and spesify the unique index
result.reset_index(inplace=True)
unique_index = result.pclass.astype(str) + ': ' + result.age_group.astype(str) + ' ' + result.sex.astype(str)

# Finalize the unique index dataframe
result_w_unique = result[['survived proportion']]
result_w_unique.set_index(unique_index, inplace=True)
print(result_w_unique)


                   survived proportion
1: older female               0.967391
1: older male                 0.302521
1: younger female             0.951220
1: younger male               0.531250
2: older female               0.872340
2: older male                 0.083333
2: younger female             0.910714
2: younger male               0.216216
3: older female               0.372093
3: older male                 0.144000
3: younger female             0.513761
3: younger male               0.183036